Loading/Cleaning Data and Exploratory Analysis

# read in the data
nndb_flat <- read.csv("nndb_flat.csv")

# filter the data to contain only food groups with limited variables
nndb <- nndb_flat %>%
  filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>% 
  select("Energy_kcal" : "Zinc_mg")

# examine the correlation among the variables
ggcorr(nndb)

According to this correlation plot, high correlations can be observed among Folate_mcg, Thiamin_mg, Niacin_mg, and Riboflavin_mg. Other high positive correlations include VitA_mcg and Manganese_mg, Protein_g and Zinc_mg etc.

Performing PCA

# scale the data
nndb_scaled <- scale(nndb)

# perform PCA on the data
pca_nndb <- prcomp(nndb_scaled, 
                   center = FALSE, 
                   scale. = FALSE)

summary(pca_nndb)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     2.3024 1.8455 1.6577 1.6205 1.41770 1.03167 0.9922
## Proportion of Variance 0.2305 0.1481 0.1195 0.1142 0.08739 0.04628 0.0428
## Cumulative Proportion  0.2305 0.3785 0.4980 0.6122 0.69960 0.74587 0.7887
##                            PC8     PC9   PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.90473 0.85456 0.8279 0.74550 0.71398 0.61241 0.58307
## Proportion of Variance 0.03559 0.03175 0.0298 0.02416 0.02216 0.01631 0.01478
## Cumulative Proportion  0.82426 0.85602 0.8858 0.90998 0.93214 0.94845 0.96323
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.46385 0.42317 0.38016 0.31599 0.27229 0.23596 0.22717
## Proportion of Variance 0.00935 0.00779 0.00628 0.00434 0.00322 0.00242 0.00224
## Cumulative Proportion  0.97258 0.98037 0.98665 0.99099 0.99422 0.99664 0.99888
##                          PC22    PC23
## Standard deviation     0.1440 0.07045
## Proportion of Variance 0.0009 0.00022
## Cumulative Proportion  0.9998 1.00000
# figure out how many components we need
var_explained_df <- data.frame(PC = 1:23, 
                               var_explained = summary(pca_nndb)$importance[3,])

head(var_explained_df)
##     PC var_explained
## PC1  1       0.23048
## PC2  2       0.37855
## PC3  3       0.49803
## PC4  4       0.61221
## PC5  5       0.69960
## PC6  6       0.74587
# draw a scree plot to show the cumulative proportion of the variance explained by each PC
var_explained_df %>%
  ggplot(aes(x = PC, 
             y = var_explained, 
             group = 1)) +
  geom_point() + 
  geom_line() + 
  labs(title="Scree plot: PCA",
       y = 'Cumulative proportion of variance explained',
       x = '')

# make 3 separate plots for the loadings of the first 3 PCs for all of the variables

# plot 1
pca_nndb_loadings1 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC1) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC1) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings1, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') + 
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC1") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# plot 2
pca_nndb_loadings2 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC2) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC2) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings2, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') + 
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC2") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# plot 3
pca_nndb_loadings3 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC3) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC3) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings3, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') + 
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC1") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# calculate pca scores
nndb2 <- nndb_flat %>%
  filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>% 
  select("Energy_kcal" : "Zinc_mg", FoodGroup)

pca_scores <- as.data.frame(pca_nndb$x)

pca_scores <- pca_scores %>% 
  mutate(Foodgroup = nndb2$FoodGroup)

head(pca_scores)
##          PC1       PC2       PC3        PC4       PC5        PC6        PC7
## 1 -1.5612574 0.1937294 1.1700818 -0.7998864 0.6148276 -0.5457873 -0.2069789
## 2 -1.3007821 0.8347198 0.8815545 -0.6019031 1.3214529  0.2214579 -0.6822156
## 3 -1.3432660 0.7163900 0.7840115 -0.6382599 1.3465715  0.1562026 -0.6526562
## 4 -0.9146238 0.3311983 0.5176899 -0.3154010 0.6263561 -0.7459331  0.2627309
## 5 -1.1155612 0.1624408 0.6605511 -0.4675455 0.5507415 -0.6857603  0.2328749
## 6 -1.4313712 0.9494195 0.6796587 -0.3914938 1.2217494 -0.7864057 -0.1050359
##          PC8         PC9        PC10         PC11       PC12        PC13
## 1  0.1524726 -0.09007289 -0.17746633 -0.006238066 0.26035911 -0.13734436
## 2 -0.2738827  0.41480350 -1.15353346 -0.834741903 0.30889039  0.55405858
## 3 -0.3294967  0.41116129 -1.12053038 -0.750164431 0.43450453  0.51107986
## 4 -0.2598225  0.13668647  0.03091109  0.085235135 0.73926198  0.81665649
## 5 -0.3205132  0.07326368 -0.05649271  0.383306591 0.97061799  0.66661165
## 6  0.1061583  0.01578496 -0.05514659  0.526141035 0.06173713 -0.07221577
##         PC14        PC15        PC16        PC17         PC18        PC19
## 1  0.1447197 -0.01319467  0.25343790 -0.05399305 -0.063357584  0.14331689
## 2  0.2175621 -0.15177657  0.13224277 -0.03502885  0.007560999  0.03636869
## 3  0.1581587 -0.17852753  0.16240197 -0.03885139  0.028064327  0.04698692
## 4 -0.5076588  0.01783758 -0.22142992 -0.01680315 -0.060215737 -0.04360432
## 5 -0.3400429 -0.15117768 -0.06695674 -0.02667881 -0.108876946  0.01367921
## 6  0.3616194 -0.11685329  0.19543874  0.08548548 -0.074980182 -0.11627266
##          PC20        PC21        PC22          PC23
## 1 -0.03535393  0.02027412 -0.02599203  0.0095088000
## 2  0.14105445  0.03600869 -0.24366859  0.0039909754
## 3  0.02773533 -0.00322706 -0.15091874 -0.0007238741
## 4 -0.08555712  0.04233724  0.21186150 -0.0098150379
## 5 -0.09059414  0.01726881  0.21349082 -0.0117930981
## 6  0.10389198  0.14936890 -0.05919124 -0.0013144711
##                           Foodgroup
## 1 Vegetables and Vegetable Products
## 2 Vegetables and Vegetable Products
## 3 Vegetables and Vegetable Products
## 4 Vegetables and Vegetable Products
## 5 Vegetables and Vegetable Products
## 6 Vegetables and Vegetable Products
# PC1 versus PC2
plot1 <- ggplot(pca_scores, 
                aes(x = PC1, 
                    y = PC2, 
                    col = Foodgroup)) + 
  geom_point() + 
  labs(title = "PC1 versus PC2")
ggplotly(plot1)
# PC1 versus PC3
plot2 <- ggplot(pca_scores, 
                aes(x = PC1, 
                    y = PC3, 
                    col = Foodgroup)) + 
  geom_point() + 
  labs(title = "PC1 versus PC3")
ggplotly(plot2)
# PC2 versus PC3
plot3 <- ggplot(pca_scores, 
                aes(x = PC2, 
                    y = PC3, 
                    col = Foodgroup)) + 
  geom_point() +
  labs(title = "PC2 versus PC3")
ggplotly(plot3)

The Vegetables and Vegetable Products is the outlier.

Identify Outlier and Performing PCA Again

nndb <- nndb_flat %>%
  filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>% 
  select("Energy_kcal" : "Zinc_mg") %>%
  slice(-c(2108))

# scale the data
nndb_scaled <- scale(nndb)

# perform PCA on the data
pca_nndb <- prcomp(nndb_scaled, 
                   center = FALSE, 
                   scale. = FALSE)

summary(pca_nndb)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.4047 1.9174 1.6360 1.5452 1.05222 1.02786 0.92913
## Proportion of Variance 0.2514 0.1598 0.1164 0.1038 0.04814 0.04593 0.03753
## Cumulative Proportion  0.2514 0.4113 0.5276 0.6314 0.67959 0.72552 0.76306
##                            PC8    PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.91925 0.8348 0.78281 0.76116 0.72114 0.70611 0.59976
## Proportion of Variance 0.03674 0.0303 0.02664 0.02519 0.02261 0.02168 0.01564
## Cumulative Proportion  0.79980 0.8301 0.85674 0.88193 0.90454 0.92621 0.94185
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.58556 0.51769 0.43258 0.42121 0.35832 0.31552 0.27864
## Proportion of Variance 0.01491 0.01165 0.00814 0.00771 0.00558 0.00433 0.00338
## Cumulative Proportion  0.95676 0.96841 0.97655 0.98426 0.98985 0.99417 0.99755
##                           PC22    PC23
## Standard deviation     0.22669 0.07037
## Proportion of Variance 0.00223 0.00022
## Cumulative Proportion  0.99978 1.00000
# figure out how many components we need
var_explained_df <- data.frame(PC = 1:23, 
                               var_explained = summary(pca_nndb)$importance[3,])

head(var_explained_df)
##     PC var_explained
## PC1  1       0.25142
## PC2  2       0.41127
## PC3  3       0.52764
## PC4  4       0.63145
## PC5  5       0.67959
## PC6  6       0.72552
# draw a scree plot to show the cumulative proportion of the variance explained by each PC
var_explained_df %>%
  ggplot(aes(x = PC, 
             y = var_explained, 
             group = 1)) +
  geom_point() + 
  geom_line() + 
  labs(title="Scree plot: PCA",
       y = 'Cumulative proportion of variance explained',
       x = '')

# make 3 separate plots for the loadings of the first 3 PCs for all of the variables

# plot 1
pca_nndb_loadings1 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC1) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC1) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings1, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') + 
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC1") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# plot 2
pca_nndb_loadings2 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC2) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC2) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings2, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') +
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC2") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# plot 3
pca_nndb_loadings3 <- as.data.frame(pca_nndb$rotation) %>% 
  select(PC3) %>% 
  mutate(variable = rownames(pca_nndb$rotation)) %>%
  rename(loadings = PC3) %>%
  arrange(desc(abs(loadings)))

ggplot(pca_nndb_loadings3, 
       aes(x = reorder(variable, 
                       abs(loadings)), 
           y = loadings)) + 
  geom_bar(stat = 'identity') + 
  labs(x = "Variable",
       y = "Loadings", 
       title = "The loadings for PC3") + 
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# calculate pca scores
nndb2 <- nndb_flat %>%
  filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>% 
  select("Energy_kcal" : "Zinc_mg", FoodGroup) %>%
  slice(-c(2108))

pca_scores <- as.data.frame(pca_nndb$x)

pca_scores <- pca_scores %>% 
  mutate(Foodgroup = nndb2$FoodGroup)

head(pca_scores)
##          PC1       PC2       PC3        PC4       PC5        PC6         PC7
## 1 -1.7027718 0.2607578 1.2860692 -0.8668904 0.4841448  0.2741745 -0.02544208
## 2 -1.5052871 1.2300725 1.0861992 -1.4691289 0.5803701 -0.6902443  0.55954856
## 3 -1.5439091 0.9445542 1.0367858 -1.2475228 0.6285126 -0.7195206  0.32270083
## 4 -0.8775346 0.5019583 0.4594099 -0.7565559 0.1531686  0.6975467 -0.29997996
## 5 -1.1404633 0.2379715 0.6655091 -0.6574418 0.1887129  0.6061557 -0.25116044
## 6 -1.5987481 1.2533865 0.7337843 -1.3253662 0.8309932  0.3283124 -0.05066289
##           PC8        PC9        PC10       PC11       PC12        PC13
## 1 -0.27230017 -0.1341082 -0.05507301 -0.1025362  0.1932623  0.09842453
## 2 -0.11789083 -1.3603008 -0.35910732 -0.5405831 -0.4009605 -0.96196729
## 3  0.13514244 -1.2581021 -0.35096053 -0.3340014  0.0610447 -0.76157568
## 4  0.25658976 -0.0177702 -0.23923311 -0.6799443  0.5433446  0.94847310
## 5  0.30133009 -0.1037168  0.08390061 -0.6953898  0.7044298  0.99451103
## 6 -0.05182628 -0.1072654  0.65232931 -0.1994536 -0.4172233 -0.04412458
##         PC14        PC15       PC16        PC17        PC18        PC19
## 1 -0.2011960  0.02647699 0.17828992 -0.14199021  0.26892752  0.05198769
## 2  0.3356937 -0.30302057 0.21382488 -0.10268721  0.06832141  0.13057971
## 3  0.4687941 -0.23202731 0.36386401 -0.12280256  0.06295249  0.07041284
## 4  0.8219214 -0.19841998 0.05424900  0.19168445 -0.13419691 -0.10533007
## 5  0.6173174 -0.21761361 0.34030427  0.08109275 -0.04898600 -0.09887924
## 6 -0.2725223  0.09566712 0.05093138 -0.23563155  0.04788329  0.01674507
##          PC20        PC21         PC22         PC23
## 1 -0.02388626 -0.07661584  0.045962709 -0.008914489
## 2  0.06830207 -0.21587178  0.057060265 -0.005327082
## 3  0.05518188 -0.17597528  0.041799704 -0.002353218
## 4 -0.06851258  0.24429998  0.001351966  0.012080095
## 5 -0.11980030  0.23091714 -0.013138326  0.013051678
## 6 -0.02876194  0.11643664  0.100532203  0.002740541
##                           Foodgroup
## 1 Vegetables and Vegetable Products
## 2 Vegetables and Vegetable Products
## 3 Vegetables and Vegetable Products
## 4 Vegetables and Vegetable Products
## 5 Vegetables and Vegetable Products
## 6 Vegetables and Vegetable Products
# PC1 versus PC2
plot1 <- ggplot(pca_scores, 
                aes(x = PC1, 
                    y = PC2, 
                    col = Foodgroup)) + 
  geom_point() + 
  labs(title = "PC1 versus PC2")
ggplotly(plot1)
# PC1 versus PC3
plot2 <- ggplot(pca_scores, 
                aes(x = PC1, 
                    y = PC3, 
                    col = Foodgroup)) + 
  geom_point() + 
  labs(title = "PC1 versus PC3")
ggplotly(plot2)
# PC2 versus PC3
plot3 <- ggplot(pca_scores, 
                aes(x = PC2, 
                    y = PC3, 
                    col = Foodgroup)) + 
  geom_point() + 
  labs(title = "PC2 versus PC3")
ggplotly(plot3)

Comments: After removing the outlier, the loadings for PC2 changed the most compared to PC1 and PC3. This is because the dataset changes the most variance towards the direction of PC2 after removing the outlier.

Description: According to the plots of the scores, the three food groups can be distinguished better after removing the outlier. Loadings of the PCs represent the weight of each variable on the corresponding directions. Since removing the outlier changes the loadings for PCs, it is reasonable to see the plots of the scores change as well.